In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
/Applications/anaconda3/lib/python3.8/site-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2)
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of "
In [2]:
df=pd.read_csv("/Users/abelabykuriakose/downloads/transaction_anomalies_dataset.csv")
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction_ID               1000 non-null   object 
 1   Transaction_Amount           1000 non-null   float64
 2   Transaction_Volume           1000 non-null   int64  
 3   Average_Transaction_Amount   1000 non-null   float64
 4   Frequency_of_Transactions    1000 non-null   int64  
 5   Time_Since_Last_Transaction  1000 non-null   int64  
 6   Day_of_Week                  1000 non-null   object 
 7   Time_of_Day                  1000 non-null   object 
 8   Age                          1000 non-null   int64  
 9   Gender                       1000 non-null   object 
 10  Income                       1000 non-null   int64  
 11  Account_Type                 1000 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 93.9+ KB
In [4]:
df.columns
Out[4]:
Index(['Transaction_ID', 'Transaction_Amount', 'Transaction_Volume',
       'Average_Transaction_Amount', 'Frequency_of_Transactions',
       'Time_Since_Last_Transaction', 'Day_of_Week', 'Time_of_Day', 'Age',
       'Gender', 'Income', 'Account_Type'],
      dtype='object')
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction_ID               1000 non-null   object 
 1   Transaction_Amount           1000 non-null   float64
 2   Transaction_Volume           1000 non-null   int64  
 3   Average_Transaction_Amount   1000 non-null   float64
 4   Frequency_of_Transactions    1000 non-null   int64  
 5   Time_Since_Last_Transaction  1000 non-null   int64  
 6   Day_of_Week                  1000 non-null   object 
 7   Time_of_Day                  1000 non-null   object 
 8   Age                          1000 non-null   int64  
 9   Gender                       1000 non-null   object 
 10  Income                       1000 non-null   int64  
 11  Account_Type                 1000 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 93.9+ KB
In [6]:
df.duplicated().sum()
Out[6]:
0
In [7]:
df.isnull().sum()
Out[7]:
Transaction_ID                 0
Transaction_Amount             0
Transaction_Volume             0
Average_Transaction_Amount     0
Frequency_of_Transactions      0
Time_Since_Last_Transaction    0
Day_of_Week                    0
Time_of_Day                    0
Age                            0
Gender                         0
Income                         0
Account_Type                   0
dtype: int64
In [8]:
df
Out[8]:
Transaction_ID Transaction_Amount Transaction_Volume Average_Transaction_Amount Frequency_of_Transactions Time_Since_Last_Transaction Day_of_Week Time_of_Day Age Gender Income Account_Type
0 TX0 1024.835708 3 997.234714 12 29 Friday 06:00 36 Male 1436074 Savings
1 TX1 1013.952065 4 1020.210306 7 22 Friday 01:00 41 Female 627069 Savings
2 TX2 970.956093 1 989.496604 5 12 Tuesday 21:00 61 Male 786232 Savings
3 TX3 1040.822254 2 969.522480 16 28 Sunday 14:00 61 Male 619030 Savings
4 TX4 998.777241 1 1007.111026 7 7 Friday 08:00 56 Female 649457 Savings
... ... ... ... ... ... ... ... ... ... ... ... ...
995 TX995 901.138758 3 976.363229 17 19 Monday 05:00 46 Female 424746 Savings
996 TX996 928.962516 4 1028.292292 10 25 Wednesday 09:00 58 Female 908278 Current
997 TX997 950.921600 2 1022.823424 13 28 Friday 02:00 58 Female 1353498 Current
998 TX998 933.291962 4 994.325450 10 8 Tuesday 04:00 58 Female 359072 Savings
999 TX999 968.289340 3 979.078420 18 10 Sunday 10:00 31 Female 1101680 Current

1000 rows × 12 columns

In [9]:
fig_amount = px.histogram(df, x='Transaction_Amount',
                          nbins=30,
                          title='Distribution of Transaction Amount')
fig_amount.show()
In [10]:
fig_box_amount = px.box(df,
                        x='Account_Type',
                        y='Transaction_Amount',
                        title='Transaction Amount by Account Type')
fig_box_amount.show()
In [11]:
fig_scatter_avg_amount_age = px.scatter(df, x='Age',
                                        y='Average_Transaction_Amount',
                                        color='Account_Type',
                                        title='Average Transaction Amount vs. Age'
                                        )
fig_scatter_avg_amount_age.show()
In [50]:
import seaborn as sns
import matplotlib.pyplot as plt
In [53]:
corr=df.corr()
fig_corr_heatmap=sns.heatmap(corr, annot=True, cmap='coolwarm')

plt.show()
<ipython-input-53-8f4f8f841fe1>:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [12]:
mean_amount=df['Transaction_Amount'].mean()
std_amount = df['Transaction_Amount'].std()
print(mean_amount)
print(std_amount)
1038.1225107502053
283.5800547153556
In [33]:
anomaly_threshold=mean_amount+2*std_amount
anomaly_threshold
Out[33]:
1605.2826201809164
In [34]:
df['Is_Anomaly'] = df['Transaction_Amount'] > anomaly_threshold
color_map = {True: 'red', False: 'Green'}

fig_anomalies = px.scatter(df, x='Transaction_Amount', y='Average_Transaction_Amount',
                           color='Is_Anomaly', color_discrete_map=color_map,
                           title='Anomalies in Transaction Amount')
df['Color'] = df['Is_Anomaly'].map(color_map)

# Update the marker size for better visibility
fig_anomalies.update_traces(marker=dict(size=12), 
                            selector=dict(mode='markers', marker_size=1))

# Show the plot
fig_anomalies.show()
In [36]:
num_anomalies=df['Is_Anomaly'].sum()
num_anomalies
Out[36]:
20
In [39]:
total_instances = df.shape[0]
# Calculate the ratio of anomalies
anomaly_ratio = num_anomalies / total_instances
print(anomaly_ratio)
0.02
In [40]:
relevant_features = ['Transaction_Amount',
                     'Average_Transaction_Amount',
                     'Frequency_of_Transactions']
In [43]:
# Split data into features (X) and target variable (y)
X = df[relevant_features]
y = df['Is_Anomaly']
In [44]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [45]:
# Train the Isolation Forest model
model = IsolationForest(contamination=0.02, random_state=42)
model.fit(X_train)
Out[45]:
IsolationForest(contamination=0.02, random_state=42)
In [46]:
y_pred = model.predict(X_test)

# Convert predictions to binary values (0: normal, 1: anomaly)
y_pred_binary = [1 if pred == -1 else 0 for pred in y_pred]

# Evaluate the model's performance
report = classification_report(y_test, y_pred_binary, target_names=['Normal', 'Anomaly'])
print(report)
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00       196
     Anomaly       1.00      1.00      1.00         4

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

In [47]:
# Relevant features used during training
relevant_features = ['Transaction_Amount', 'Average_Transaction_Amount', 'Frequency_of_Transactions']

# Get user inputs for features
user_inputs = []
for feature in relevant_features:
    user_input = float(input(f"Enter the value for '{feature}': "))
    user_inputs.append(user_input)

# Create a DataFrame from user inputs
user_df = pd.DataFrame([user_inputs], columns=relevant_features)

# Predict anomalies using the model
user_anomaly_pred = model.predict(user_df)

# Convert the prediction to binary value (0: normal, 1: anomaly)
user_anomaly_pred_binary = 1 if user_anomaly_pred == -1 else 0

if user_anomaly_pred_binary == 1:
    print("Anomaly detected: This transaction is flagged as an anomaly.")
else:
    print("No anomaly detected: This transaction is normal.")
Enter the value for 'Transaction_Amount': 2500
Enter the value for 'Average_Transaction_Amount': 2566
Enter the value for 'Frequency_of_Transactions': 34
Anomaly detected: This transaction is flagged as an anomaly.
In [ ]: